zinbModelFromW <- function(counts, sim_W){
  mod <- model.matrix(~ sim_W)
  zinb_sim <- zinbFit(counts, ncores = 3, X=mod, which_X_mu=1:3, which_X_pi=1:3,
                      commondispersion=FALSE)
  true_alpha_mu <- zinb_sim@beta_mu[-1,]
  true_alpha_pi <- zinb_sim@beta_pi[-1,]
  true_beta_mu <- zinb_sim@beta_mu[1,,drop=FALSE]
  true_beta_pi <- zinb_sim@beta_pi[1,,drop=FALSE]
  true_gamma_mu <- zinb_sim@gamma_mu
  true_gamma_pi <- zinb_sim@gamma_pi
  true_zeta <- zinb_sim@zeta
  zinbModel(W=sim_W, alpha_mu=true_alpha_mu, alpha_pi=true_alpha_pi,
            beta_mu=true_beta_mu, beta_pi=true_beta_pi,
            zeta = true_zeta, gamma_mu=true_gamma_mu,
            gamma_pi=true_gamma_pi)
}

Simulation plan

simSummary = data.frame(realData = rep('Allen', 3),
                        K = rep(2, 3),
                        nclusters = rep(3, 3),
                        corMuPi = rep('no', 3),
                        dim =  rep('1000x100', 3),
                        zeroFract = c(.25, .5, .75),
                        nreplicates = 10)
kable(simSummary)                    
realData K nclusters corMuPi dim zeroFract nreplicates
Allen 2 3 no 1000x100 0.25 10
Allen 2 3 no 1000x100 0.50 10
Allen 2 3 no 1000x100 0.75 10

To simulate a dataset, steps are as follow

  1. Filter out the genes that do not have at least 5 counts in at least 5 cells.
  2. Sample at random 200 cells.
  3. Sort genes by decreasing variance.
  4. Create bins of genes according to percentage of zeros. For example, genes with percentage of zeros between <20%, 20%-60%, >60%.
  5. Select genes with appropriate percentage of zeros to get wanted percentage of zeros in whole dataset. First genes selected have the highest variance.
  6. Fit zinb, get W, plot W.
  7. Filter out cells not clearly in a cluster.
  8. Sample at random 100 cells from remaining cells.
  9. Re-fit zinb, get W, simulate W from gaussians (which becomes our trueW).
  10. Get true \(\gamma_{\mu}\), \(\gamma_{\pi}\), \(\alpha_{\mu}\), \(\alpha_{\pi}\), \(\beta_{\mu}\), \(\beta_{\pi}\), and \(\zeta\) by fitting with genewise dispersion \[ln(\mu) = (1, trueW) (\beta_{\mu}, \alpha_{\mu}) + (V\gamma_{\mu})^T,\] \[\text{logit}(\pi) = (1, trueW) (\beta_{\pi}, \alpha_{\pi}) + (V\gamma_{\pi})^T.\]
  11. Create zinb model = zinbModel(trueW, \(\hat{\beta_{\mu}}\),\(\hat{\beta_{\pi}}\), \(\hat{\gamma_{\mu}}\), \(\hat{\gamma_{\pi}}\), \(\hat{\alpha_{\mu}}\), \(\hat{\alpha_{\pi}}\), \(\zeta\)).
  12. Simulate counts using zinbSim(model).

Using this procedure, we have the simulated counts and true \(W\), \(\gamma_{\mu}\), \(\gamma_{\pi}\), \(\alpha_{\mu}\), \(\alpha_{\pi}\), \(\beta_{\mu}\), \(\beta_{\pi}\), and \(\zeta\).

To simulate the three datasets from Allen real dataset, the same 100 cells were used.

ZINB model

Select cells

data("allen")

prefilter <- allen[grep("^ERCC-", rownames(allen), invert = TRUE),
                    which(colData(allen)$Core.Type=="Core")]
sampleCells <- sample(ncol(prefilter), 200)
postfilter <- prefilter[, sampleCells]
filterGenes <- apply(assay(postfilter) > 5, 1, sum) >= 5
postfilter <- postfilter[filterGenes, ]

bio = bioIni =  as.factor(colData(postfilter)$driver_1_s)
cl <- as.factor(colData(postfilter)$Primary.Type)
postfilter <- assay(postfilter)

vars <- rowVars(log1p(postfilter))
names(vars) <- rownames(postfilter)
vars <- sort(vars, decreasing = TRUE)
pZero <- rowMeans(log1p(postfilter) == 0)
names(pZero) <- rownames(postfilter)
pZero = pZero[names(vars)]
chosenGenes = c(names(pZero)[pZero > 0.75][1:100],
                names(pZero)[pZero<0.75 & pZero>0.6][1:400],
                names(pZero)[pZero<0.6 & pZero>0.2][1:400],
                names(pZero)[pZero<0.2][1:100])

core <- postfilter[chosenGenes, ]
#sum(core == 0)/(nrow(core) * ncol(core))

The dimensions are

dim(core)
## [1] 1000  200
par(mfrow = c(1,2))
fq <- betweenLaneNormalization(core, which="full")
pca <- prcomp(t(log1p(fq)))
plot(pca$x, col=cols[bio], pch=19, main="PCA of log-counts, 200 cells")
#legend("bottomleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19, main="PCA of log-counts, 200 cells")

Fit data with K = 2, V and X intercepts in both Mu and Pi, commondispersion = FALSE, and no covariate.

print(system.time(zinb <- zinbFit(core, ncores = 3, K = 2,
                                  commondispersion = FALSE)))
##    user  system elapsed 
## 367.619 217.622 231.034

100 cells are selected. Now, the dimensions are

estW = zinb@W
filt1 = estW[,1] > -1 & bio == 'Rbp4-Cre_KL100'
estW = estW[!filt1, ]
bio = bio[!filt1]
core = core[,!filt1]

filt2 = estW[,2] < 0 & bio == 'Scnn1a-Tg3-Cre'
estW = estW[!filt2, ]
bio = bio[!filt2]
core = core[,!filt2]

chosen = sample(nrow(estW), 100, replace = F)
bio = bio[chosen]
estW = estW[chosen, ]
core5 = core[,chosen]
dim(core5)
## [1] 1000  100
chosenCells = colnames(core5)
detection_rate <- colSums(core5>0)
coverage <- colSums(core5)
print(system.time(zinb5 <- zinbFit(core5, ncores = 3, K = 2,
                                  commondispersion = FALSE)))
##    user  system elapsed 
## 243.519 130.365 154.869
par(mfrow = c(1,3))
plot(zinb@W, col=cols[bioIni], pch=19, xlab="W1", ylab="W2",
     main="Fitted W, 200 cells")
plot(estW, col=cols[bio], pch=19, xlab="W1", ylab="W2",
     main="Fitted W, 100 cells")
plot(zinb5@W, col=cols[bio], pch=19, xlab="W1", ylab="W2",
     main="Fitted W, 100 cells, refit")

Simulate true parameters

50%

W

zinb = zinb5
W = data.frame(W1 = zinb@W[,1], W2 = zinb@W[,2], bio = bio)
ggplot(melt(W), aes(value, fill = bio)) + geom_density(alpha = 0.2) +
  facet_grid(~ variable) + xlab('fitted W') + theme_bw()

W simulated with \[W_1^{Scnn1a} \sim \mathcal{N}(1, .3)\] \[W_2^{Scnn1a} \sim \mathcal{N}(3, .3)\] \[W_1^{Rbp4} \sim \mathcal{N}(2, .3)\] \[W_2^{Rbp4} \sim \mathcal{N}(-3, .3)\] \[W_1^{Ntsr1} \sim \mathcal{N}(-4, .4)\] \[W_2^{Ntsr1} \sim \mathcal{N}(0, .4)\] TODO: change numeric values.

Sc = grepl('^Scnn1a', W$bio)
Rb = grepl('^Rbp4', W$bio)
Nt = grepl('^Ntsr1', W$bio)
W[Sc, 'W1sim'] = rnorm(sum(Sc), 1, .3)
W[Sc, 'W2sim'] = rnorm(sum(Sc), 3, .3)
W[Rb, 'W1sim'] = rnorm(sum(Rb), 2, .3)
W[Rb, 'W2sim'] = rnorm(sum(Rb), -3, .3)
W[Nt, 'W1sim'] = rnorm(sum(Nt), -4, .4)
W[Nt, 'W2sim'] = rnorm(sum(Nt), 0, .4)

par(mfrow=c(1, 2))
plot(zinb@W, col=cols[bio], pch=19, xlab="W1", ylab="W2",
     main="Fitted")
plot(W[,c('W1sim', 'W2sim')], col=cols[bio], pch=19, xlab="W1", ylab="W2",
     main="Simulated")

sim_W <- as.matrix(W[, c('W1sim', 'W2sim')])

alpha, beta, gamma

simModel = simModel5 = zinbModelFromW(core5, sim_W)
simData = simData5 = zinbSim(simModel, seed = 8746)
df <- data.frame(gamma_mu = simModel@gamma_mu[1, ],
                 gamma_pi = simModel@gamma_pi[1, ],
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                  gamma_mu   gamma_pi detection_rate   coverage
## gamma_mu        1.0000000 -0.2777558      0.3361947  0.8805161
## gamma_pi       -0.2777558  1.0000000     -0.9845389 -0.5059706
## detection_rate  0.3361947 -0.9845389      1.0000000  0.5370054
## coverage        0.8805161 -0.5059706      0.5370054  1.0000000
df <- data.frame(alpha_mu_1 = simModel@alpha_mu[1, ],
                 alpha_mu_2 = simModel@alpha_mu[2, ],
                 alpha_pi_1 = simModel@alpha_pi[1, ],
                 alpha_pi_2 = simModel@alpha_pi[2, ])
pairs(df, pch=19, main = 'alpha')

print(cor(df, method="spearman"))
##             alpha_mu_1 alpha_mu_2  alpha_pi_1  alpha_pi_2
## alpha_mu_1  1.00000000  0.1434140 -0.28012445 -0.03256204
## alpha_mu_2  0.14341402  1.0000000 -0.12087510 -0.39915616
## alpha_pi_1 -0.28012445 -0.1208751  1.00000000  0.03098031
## alpha_pi_2 -0.03256204 -0.3991562  0.03098031  1.00000000
df <- data.frame(beta_mu = simModel@beta_mu[1, ],
                 beta_pi = simModel@beta_pi[1, ])
pairs(df, pch=19, main = 'Beta')

print(cor(df, method="spearman"))
##            beta_mu    beta_pi
## beta_mu  1.0000000 -0.4648914
## beta_pi -0.4648914  1.0000000

25%

chosenGenes = c(names(pZero)[pZero > 0.4][1:50],
                names(pZero)[pZero<0.4 & pZero>0.2][1:550],
                names(pZero)[pZero<0.2][1:400])

core25 <- postfilter[chosenGenes, chosenCells]
sum(core25 == 0)/(nrow(core25) * ncol(core25))
## [1] 0.25206
print(system.time(zinb25 <- zinbFit(core25, ncores = 3, K = 2,
                                  commondispersion = FALSE)))
##    user  system elapsed 
## 236.609 130.312 152.826

W

zinb = zinb25
W = data.frame(W1 = zinb@W[,1], W2 = zinb@W[,2], bio = bio)
ggplot(melt(W), aes(value, fill = bio)) + geom_density(alpha = 0.2) +
  facet_grid(~ variable) + xlab('fitted W') + theme_bw()

W simulated with \[W_1^{Scnn1a} \sim \mathcal{N}(1.5, .3)\] \[W_2^{Scnn1a} \sim \mathcal{N}(2, .3)\] \[W_1^{Rbp4} \sim \mathcal{N}(1, .3)\] \[W_2^{Rbp4} \sim \mathcal{N}(-3, .3)\] \[W_1^{Ntsr1} \sim \mathcal{N}(-3, .3)\] \[W_2^{Ntsr1} \sim \mathcal{N}(0, .3)\]

Sc = grepl('^Scnn1a', W$bio)
Rb = grepl('^Rbp4', W$bio)
Nt = grepl('^Ntsr1', W$bio)
W[Sc, 'W1sim'] = rnorm(sum(Sc), 1.5, .3)
W[Sc, 'W2sim'] = rnorm(sum(Sc), 2, .3)
W[Rb, 'W1sim'] = rnorm(sum(Rb), 1, .3)
W[Rb, 'W2sim'] = rnorm(sum(Rb), -3, .3)
W[Nt, 'W1sim'] = rnorm(sum(Nt), -3, .3)
W[Nt, 'W2sim'] = rnorm(sum(Nt), 0, .3)

par(mfrow=c(1, 2))
plot(zinb@W, col=cols[bio], pch=19, xlab="W1", ylab="W2",
     main="Fitted", xlim = c(-5, 3), ylim = c(-4, 3))
plot(W[,c('W1sim', 'W2sim')], col=cols[bio], pch=19, xlab="W1", ylab="W2",
     main="Simulated", xlim = c(-5, 3), ylim = c(-4, 3))

sim_W <- as.matrix(W[, c('W1sim', 'W2sim')])

alpha, beta, gamma

simModel = simModel25 = zinbModelFromW(core25, sim_W)
simData = simData25 = zinbSim(simModel, seed = 8746)
detection_rate <- colSums(core25>0)
coverage <- colSums(core25)
df <- data.frame(gamma_mu = simModel@gamma_mu[1, ],
                 gamma_pi = simModel@gamma_pi[1, ],
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                  gamma_mu   gamma_pi detection_rate   coverage
## gamma_mu        1.0000000 -0.3801500      0.4191412  0.8914011
## gamma_pi       -0.3801500  1.0000000     -0.9886676 -0.5168317
## detection_rate  0.4191412 -0.9886676      1.0000000  0.5474675
## coverage        0.8914011 -0.5168317      0.5474675  1.0000000
df <- data.frame(alpha_mu_1 = simModel@alpha_mu[1, ],
                 alpha_mu_2 = simModel@alpha_mu[2, ],
                 alpha_pi_1 = simModel@alpha_pi[1, ],
                 alpha_pi_2 = simModel@alpha_pi[2, ])
pairs(df, pch=19, main = 'alpha')

print(cor(df, method="spearman"))
##             alpha_mu_1  alpha_mu_2  alpha_pi_1  alpha_pi_2
## alpha_mu_1  1.00000000 -0.03073397 -0.37139808  0.02845402
## alpha_mu_2 -0.03073397  1.00000000 -0.04157398 -0.29780065
## alpha_pi_1 -0.37139808 -0.04157398  1.00000000 -0.22350240
## alpha_pi_2  0.02845402 -0.29780065 -0.22350240  1.00000000
df <- data.frame(beta_mu = simModel@beta_mu[1, ],
                 beta_pi = simModel@beta_pi[1, ])
pairs(df, pch=19, main = 'Beta')

print(cor(df, method="spearman"))
##             beta_mu     beta_pi
## beta_mu  1.00000000 -0.08039971
## beta_pi -0.08039971  1.00000000

75%

chosenGenes = c(names(pZero)[pZero > 0.8][1:800],
                names(pZero)[pZero < 0.8 & pZero > 0.3][1:100],
                names(pZero)[pZero < 0.3][1:100])

core75 <- postfilter[chosenGenes, chosenCells]
sum(core75 == 0)/(nrow(core75) * ncol(core75))
## [1] 0.74562
print(system.time(zinb75 <- zinbFit(core75, ncores = 3, K = 2,
                                    maxiter.optimize = 100,
                                  commondispersion = FALSE)))
## Warning in simpute.als(x, J, thresh, lambda, maxit, trace.it, warm.start, :
## Convergence not achieved by 100 iterations
## Warning in optimize(f = zinb.loglik.dispersion, Y = as.matrix(Y), mu =
## mu, : NA/Inf replaced by maximum positive value

## Warning in optimize(f = zinb.loglik.dispersion, Y = as.matrix(Y), mu =
## mu, : NA/Inf replaced by maximum positive value

## Warning in optimize(f = zinb.loglik.dispersion, Y = as.matrix(Y), mu =
## mu, : NA/Inf replaced by maximum positive value

## Warning in optimize(f = zinb.loglik.dispersion, Y = as.matrix(Y), mu =
## mu, : NA/Inf replaced by maximum positive value

## Warning in optimize(f = zinb.loglik.dispersion, Y = as.matrix(Y), mu =
## mu, : NA/Inf replaced by maximum positive value

## Warning in optimize(f = zinb.loglik.dispersion, Y = as.matrix(Y), mu =
## mu, : NA/Inf replaced by maximum positive value

## Warning in optimize(f = zinb.loglik.dispersion, Y = as.matrix(Y), mu =
## mu, : NA/Inf replaced by maximum positive value
##    user  system elapsed 
## 836.387 540.840 569.358

W

zinb = zinb75
W = data.frame(W1 = zinb@W[,1], W2 = zinb@W[,2], bio = bio)
ggplot(melt(W), aes(value, fill = bio)) + geom_density(alpha = 0.2) +
  facet_grid(~ variable) + xlab('fitted W') + theme_bw()

W simulated with \[W_1^{Scnn1a} \sim \mathcal{N}(.5, .3)\] \[W_2^{Scnn1a} \sim \mathcal{N}(2, .3)\] \[W_1^{Rbp4} \sim \mathcal{N}(-4, .3)\] \[W_2^{Rbp4} \sim \mathcal{N}(-1, .3)\] \[W_1^{Ntsr1} \sim \mathcal{N}(2, .3)\] \[W_2^{Ntsr1} \sim \mathcal{N}(-3, .3)\] TODO: change numeric values.

Sc = grepl('^Scnn1a', W$bio)
Rb = grepl('^Rbp4', W$bio)
Nt = grepl('^Ntsr1', W$bio)
W[Sc, 'W1sim'] = rnorm(sum(Sc), 5, 1)
W[Sc, 'W2sim'] = rnorm(sum(Sc), 2.5, 1.5)
W[Rb, 'W1sim'] = rnorm(sum(Rb), -2.5, 1)
W[Rb, 'W2sim'] = rnorm(sum(Rb), -7.5, 1)
W[Nt, 'W1sim'] = rnorm(sum(Nt), -7.5, 1)
W[Nt, 'W2sim'] = rnorm(sum(Nt), 5, 2)

par(mfrow=c(1, 2))
plot(zinb@W, col=cols[bio], pch=19, xlab="W1", ylab="W2",
     main="Fitted", ylim = c(-10, 10), xlim = c(-15, 12))
plot(W[,c('W1sim', 'W2sim')], col=cols[bio], pch=19, xlab="W1", ylab="W2",
     main="Simulated", ylim = c(-10, 10), xlim = c(-15, 12))

sim_W <- as.matrix(W[, c('W1sim', 'W2sim')])

alpha, beta, gamma

simModel = simModel75 = zinbModelFromW(core75, sim_W)
simData = simData75 = zinbSim(simModel, seed = 8746)
detection_rate <- colSums(core75>0)
coverage <- colSums(core75)
df <- data.frame(gamma_mu = simModel@gamma_mu[1, ],
                 gamma_pi = simModel@gamma_pi[1, ],
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                  gamma_mu   gamma_pi detection_rate   coverage
## gamma_mu        1.0000000 -0.2133453      0.3033864  0.8667627
## gamma_pi       -0.2133453  1.0000000     -0.9844274 -0.5080228
## detection_rate  0.3033864 -0.9844274      1.0000000  0.5622813
## coverage        0.8667627 -0.5080228      0.5622813  1.0000000
df <- data.frame(alpha_mu_1 = simModel@alpha_mu[1, ],
                 alpha_mu_2 = simModel@alpha_mu[2, ],
                 alpha_pi_1 = simModel@alpha_pi[1, ],
                 alpha_pi_2 = simModel@alpha_pi[2, ])
pairs(df, pch=19, main = 'alpha')

print(cor(df, method="spearman"))
##              alpha_mu_1  alpha_mu_2   alpha_pi_1  alpha_pi_2
## alpha_mu_1  1.000000000  0.02409092  0.008606553 -0.01618534
## alpha_mu_2  0.024090924  1.00000000 -0.043964108  0.12142476
## alpha_pi_1  0.008606553 -0.04396411  1.000000000  0.01038956
## alpha_pi_2 -0.016185340  0.12142476  0.010389562  1.00000000
df <- data.frame(beta_mu = simModel@beta_mu[1, ],
                 beta_pi = simModel@beta_pi[1, ])
pairs(df, pch=19, main = 'Beta')

print(cor(df, method="spearman"))
##             beta_mu     beta_pi
## beta_mu  1.00000000 -0.05301733
## beta_pi -0.05301733  1.00000000

ZINB simulation

25% percentage of zeros

core = core25
zinb = zinb25
simData = simData25
simModel = simModel25
obs_detection_rate <- colMeans(core>0)
obs_coverage <- colSums(core)
sim_detection_rate <- rowMeans(simData$counts>0)
sim_coverage <- rowSums(simData$counts)

par(mfrow=c(2, 2))
plot(rowMeans(getPi(zinb)), obs_detection_rate,
     xlab="Average fitted Pi", ylab="Detection Rate",
     pch=19, col=cols[bio], ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(zinb))), log1p(obs_coverage), xlim = c(5, 8),
     xlab="Average fitted log Mu", ylab="log Coverage", pch=19, ylim = c(11,15),
     col=cols[bio])
plot(rowMeans(getPi(simModel)), sim_detection_rate,
     xlab="Average simulated Pi", ylab="True Detection Rate",
     pch=19, col=cols[bio], ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(simModel))), log1p(sim_coverage), xlim = c(5, 8),
     xlab="Average simulated log Mu", ylab="log Coverage", pch=19,
     col=cols[bio],ylim = c(11,15))

pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
par(mfrow=c(1, 2))
smoothScatter(mean, rowMeans(core == 0),xlim = c(3,8),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = 'Observed')
smoothScatter(colMeans(log1p(getMu(simModel))), colMeans(getPi(simModel)), nbin = 256,
              nrpoints=Inf, pch="", cex=.7, xlim = c(3,8),
              xlab = "Mean Expression", main = 'Simulated',
              ylab = "Dropout Rate",ylim = c(0,1),
              colramp = pal)

50% percentage of zeros

core = core5
zinb = zinb5
simData = simData5
simModel = simModel5
obs_detection_rate <- colMeans(core>0)
obs_coverage <- colSums(core)
sim_detection_rate <- rowMeans(simData$counts>0)
sim_coverage <- rowSums(simData$counts)

par(mfrow=c(2, 2))
plot(rowMeans(getPi(zinb)), obs_detection_rate,
     xlab="Average fitted Pi", ylab="Detection Rate",
     pch=19, col=cols[bio], ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(zinb))), log1p(obs_coverage), xlim = c(4, 7),
     xlab="Average fitted log Mu", ylab="log Coverage", pch=19, ylim = c(9,14),
     col=cols[bio])
plot(rowMeans(getPi(simModel)), sim_detection_rate,
     xlab="Average simulated Pi", ylab="True Detection Rate",
     pch=19, col=cols[bio], ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(simModel))), log1p(sim_coverage), xlim = c(4, 7),
     xlab="Average simulated log Mu", ylab="log Coverage", pch=19,
     col=cols[bio],ylim = c(9,14))

pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
par(mfrow=c(1, 2))
smoothScatter(mean, rowMeans(core == 0),xlim = c(3,8),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = 'Observed')
smoothScatter(colMeans(log1p(getMu(simModel))), colMeans(getPi(simModel)), nbin = 256,
              nrpoints=Inf, pch="", cex=.7, xlim = c(3,8),
              xlab = "Mean Expression", main = 'Simulated',
              ylab = "Dropout Rate",ylim = c(0,1),
              colramp = pal)

75% percentage of zeros

core = core75
zinb = zinb75
simData = simData75
simModel = simModel75
obs_detection_rate <- colMeans(core>0)
obs_coverage <- colSums(core)
sim_detection_rate <- rowMeans(simData$counts>0)
sim_coverage <- rowSums(simData$counts)

par(mfrow=c(2, 2))
plot(rowMeans(getPi(zinb)), obs_detection_rate,
     xlab="Average fitted Pi", ylab="Detection Rate",
     pch=19, col=cols[bio], ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(zinb))), log1p(obs_coverage), xlim = c(3, 7),
     xlab="Average fitted log Mu", ylab="log Coverage", pch=19, ylim = c(10,14),
     col=cols[bio])
plot(rowMeans(getPi(simModel)), sim_detection_rate,
     xlab="Average simulated Pi", ylab="True Detection Rate",
     pch=19, col=cols[bio], ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(simModel))), log1p(sim_coverage), xlim = c(3, 7),
     xlab="Average simulated log Mu", ylab="log Coverage", pch=19,
     col=cols[bio],ylim = c(10,14))

pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
par(mfrow=c(1, 2))
smoothScatter(mean, rowMeans(core == 0),xlim = c(3,8),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = 'Observed')
smoothScatter(colMeans(log1p(getMu(simModel))), colMeans(getPi(simModel)), nbin = 256,
              nrpoints=Inf, pch="", cex=.7, xlim = c(3,8),
              xlab = "Mean Expression", main = 'Simulated',
              ylab = "Dropout Rate",ylim = c(0,1),
              colramp = pal)

10 replicates

B = 10
bio = as.vector(bio)
originalCounts = core5
simModel = simModel5
simData = lapply(seq_len(B), function(i) zinbSim(simModel, seed=i))
save(originalCounts, bio, simModel, simData, file = "./datasets/sim_allen5.rda")
zero5 = sapply(simData, function(x) x$zeroFraction)
originalCounts = core25
simModel = simModel25
simData = lapply(seq_len(B), function(i) zinbSim(simModel, seed=i))
save(originalCounts, bio, simModel, simData, file = "./datasets/sim_allen25.rda")
zero25 = sapply(simData, function(x) x$zeroFraction)
originalCounts = core75
simModel = simModel75
simData = lapply(seq_len(B), function(i) zinbSim(simModel, seed=i))
save(originalCounts, bio, simModel, simData, file = "./datasets/sim_allen75.rda")
zero75 = sapply(simData, function(x) x$zeroFraction)
zeroFrac = data.frame(zero = c(zero5, zero25, zero75),
                      perc = c(rep(.5, 10), rep(.25, 10), rep(.75, 10)))
zeroFrac$perc = factor(zeroFrac$perc)
zeroFrac = melt(zeroFrac)
ggplot(zeroFrac, aes(x = perc, y = value)) + geom_boxplot() + 
  ylab('zero fraction') + xlab('wanted zero fraction') + ggtitle('one boxplot = 10 replicates')